Load data

cafe = read_csv(here::here("data/Sidewalk_Caf__Licenses_and_Applications_clean.csv"))
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   zip = col_double(),
##   swc_sq_ft = col_double(),
##   swc_tables = col_double(),
##   swc_chairs = col_double(),
##   lat = col_double(),
##   long = col_double(),
##   community_district = col_double(),
##   city_council_district = col_double(),
##   app_sq_ft = col_double(),
##   app_tables = col_double(),
##   app_chairs = col_double(),
##   app_status_date = col_datetime(format = ""),
##   expiration_date = col_datetime(format = ""),
##   app_too_date = col_datetime(format = ""),
##   submit_date = col_datetime(format = ""),
##   intake_dd = col_datetime(format = ""),
##   send_package_dd = col_datetime(format = ""),
##   cp_dd = col_datetime(format = ""),
##   cb_dd = col_datetime(format = ""),
##   hearing_dd = col_datetime(format = "")
##   # ... with 4 more columns
## )
## See spec(...) for full column specifications.
parking = read_csv(
  here::here("data/parking_vio2021_cleanv1.csv")) %>% 
  filter(hour != 12.3) 
## Parsed with column specification:
## cols(
##   id = col_double(),
##   summons_number = col_double(),
##   registration_state = col_character(),
##   issue_date = col_datetime(format = ""),
##   violation_code = col_double(),
##   vehicle_make = col_character(),
##   hour = col_double(),
##   min = col_double(),
##   violation_county = col_character(),
##   house_number = col_character(),
##   street_name = col_character(),
##   intersecting_street = col_character(),
##   vehicle_color = col_character(),
##   vehicle_year = col_double(),
##   address = col_character(),
##   long = col_double(),
##   lat = col_double(),
##   borough = col_character(),
##   fine_amount = col_double()
## )

Cafe Map

plot_cafe_map =
  parking %>%
  count(street_name, name = "ticket") %>%
  right_join(cafe) %>%
  mutate(ticket = replace_na(ticket,1e-10))
## Joining, by = "street_name"
pal = colorNumeric(palette = c("viridis", "magma", "inferno","plasma")[[4]],
                   domain = plot_cafe_map$ticket %>% log())

plot_cafe_map =
  plot_cafe_map%>%
  mutate(pop =
           str_c("<b>",business_name,"</b><br>",round(ticket)," tickets")) %>% 
  leaflet() %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addCircleMarkers(
    ~ long,
    ~ lat,
    color = ~pal(ticket %>% log()),
    radius = .1,
    popup = ~ (pop)
  )

plot_cafe_map

##Geo vs ticket count

parking %>% 
  drop_na(address) %>% 
  group_by(borough, lat, long) %>% 
  summarize(tickets_count = n()) %>% 
  mutate(text_label = str_c("Tickets count:", tickets_count, "Borough:", borough)) %>% 
  plot_ly(x = ~lat, y = ~long, z = ~tickets_count, text = ~ text_label) %>%
  add_markers(color = ~tickets_count,
              alpha = 0.0001) 
## `summarise()` regrouping output by 'borough', 'lat' (override with `.groups` argument)
parking %>%
  drop_na(address) %>% 
  dplyr::select(long, lat, hour) %>%
  mutate(long = abs(long)) %>%
  drop_na() %>%
  with(., MASS::kde2d(lat, long)) %>%
  as_tibble() %>%
  plot_ly(x = ~ x, y = ~ y, z = ~ z) %>%
  add_surface() %>%
  layout(scene = list(
    yaixs = list(autorange = "reversed"),
    zaxis = list(
      range = c(1, 150),
      title = "",
      showline = FALSE,
      showticklabels = FALSE,
      showline = FALSE,
      showgrid = FALSE
    ),
    camera = list(eye = list(
      x = -1.25, y = 1.25, z = 1.25
    )),
    showlegend = FALSE
  ))
parking %>%
  ggplot(aes(x = long, y = lat)) +
  geom_hex() +
  theme(
    legend.position = "right") +
  labs(
    title = "tickets counts show in georaphic information") 
## Warning: Removed 246 rows containing non-finite values (stat_binhex).

Violation vs Time

1.The distribution of tickets quantities over year

parking %>% 
  mutate(month = lubridate::month(issue_date),
         month = forcats::fct_reorder(as.factor(month),month)
         ) %>% 
  drop_na(month,borough) %>% 
  group_by(borough,month) %>% 
  summarize(tickets = n()) %>% 
  ungroup() %>%
  mutate(borough = forcats::fct_reorder(borough,tickets,sum)) %>% 
  ggplot(aes(x = month, y = tickets, fill = borough,group = borough)) + 
    geom_bar(stat = "identity",position = "dodge") +
    labs(
      x = 'Issuing Month',
      y = 'counts',
      title = 'The distribution of tickets quantities over month ')
## `summarise()` regrouping output by 'borough' (override with `.groups` argument)

Conclusion: This dataset has nearly 39 million observations, most of which are recorded during June, July, August, and September.

2.plots: violation vs weekday

parking_day = parking %>%
  dplyr::select(issue_date, summons_number, borough) %>% 
  drop_na(borough)

day_order = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")

parking_day %>% 
  mutate(day_week = weekdays(issue_date)) %>%
  group_by(day_week, borough) %>% 
  summarize(n = n()) %>%
  mutate(day_week = factor(day_week, levels = day_order)) %>% 
  arrange(day_week) %>% 
  plot_ly(x = ~ day_week, y = ~ n, type = 'scatter', mode = "line", color = ~borough) %>%
  layout(
    title = 'Violations by weekday',
    xaxis = list(
      type = 'category',
      title = 'Weekday'),
    yaxis = list(
      title = 'Count of violations'))
## `summarise()` regrouping output by 'day_week' (override with `.groups` argument)
## Warning: Ignoring 3 observations

Conclusion: The volume of parking tickets issuing during weekdays are larger than volume of those issuing during weekends. This could be attibuted to two reasons:
1. the overall demand for parking in NYC are smaller during weekend than during weekdays 2. Many parking spots are designed to be free parking during weekends.

All the five boroughs in NYC–Manhattan, Bronx, Brooklyn, Queens and Staten Island have the same trend in the term of parking tickets generating. Manhattan always has the highest parking tickets volume, and follows with Brooklyn, Queens, Bronx and Staten Island.

3.vilation vs hour

parking %>% 
  group_by(hour) %>% 
  summarize(n = n()) %>%
  plot_ly(x = ~hour, y = ~n, type = 'scatter',mode = 'line') %>%
  layout(
    title = 'Violations per Hour',
    xaxis = list(
      type = 'category',
      title = 'Hour',
      range = c(0, 23)),
    yaxis = list(
      title = 'Count of violations'))
## `summarise()` ungrouping output (override with `.groups` argument)
parking %>% 
  filter(hour < 23) %>% 
  drop_na(borough) %>% 
  ggplot(aes(x = hour, fill = borough)) +
  geom_bar(stat = "count") +
  labs(title = 'Violations per Hour for each borough') +
  ylab('Count of tickets') +
  scale_x_continuous(breaks=seq(0, 23, 1))

During a day, the parking tickets generating has two peaks– around 8:00 am and aroung 1:00 pm. This may caused by the high parking demand especially during meal time.

vio_code = readxl::read_xlsx('data/ParkingViolationCodes_January2020.xlsx') %>% 
  janitor::clean_names() %>% 
  dplyr::select(violation_code, violation_description) %>% 
  mutate(violation_description = str_to_lower(violation_description))

vio_count_boro = 
  parking %>% 
  left_join(vio_code, by = 'violation_code')


vio_count_boro %>% 
  group_by(hour,borough) %>% 
  summarise(n = n()) %>% 
  plot_ly(x = ~hour, y = ~n, type = 'scatter',mode = 'line', color= ~ borough) %>%
  layout(
    title = 'Violations per Hour',
    xaxis = list(
      type = 'category',
      title = 'Hour',
      range = c(0, 23)),
    yaxis = list(
      title = 'Count of violations'))
## `summarise()` regrouping output by 'hour' (override with `.groups` argument)

##Overall violation type

violation_count = 
  parking %>% 
  group_by(violation_code) %>% 
  summarise(n = n()) %>% 
  left_join(vio_code, by = 'violation_code') %>% 
  arrange(desc(n)) %>% 
  filter(n > 10000)
## `summarise()` ungrouping output (override with `.groups` argument)

bar_plot:

violation_count %>% 
  mutate(violation_description = fct_reorder(violation_description, n)) %>% 
  ggplot(aes(x = violation_description, y = n, fill = violation_description)) +
  geom_bar(stat = "identity") +
  theme(
    axis.text.x = element_text(angle = 90, vjust = .5, hjust = 1),
    legend.position = "right",
    legend.text = element_text(size = 8)
  ) 

The top ten

##violation type by Borough

Manhattan_vio = 
  vio_count_boro %>% 
  filter(borough == 'Manhattan') %>% 
  group_by(violation_description) %>% 
  summarize(n = n()) %>% 
  mutate(violation_description = fct_reorder(violation_description, n)) %>% 
  arrange(desc(n)) %>% 
  head(10) %>% 
  ggplot(aes(x = violation_description, y = n, fill = violation_description)) +
  geom_bar(stat = "identity") + 
  theme(
    axis.text.x = element_text(angle = 90, vjust = .5, hjust = 1),
    legend.position = "right",
    legend.text = element_text(size = 8)
  ) +
  labs(
    title = 'Top 10 violations in Manhattan') +
    xlab("violation description") +
    ylab('Count of tickets')  
## `summarise()` ungrouping output (override with `.groups` argument)
Queens_vio = 
  vio_count_boro %>% 
  filter(borough == 'Queens') %>% 
  group_by(violation_description) %>% 
  summarize(n = n()) %>% 
  mutate(violation_description = fct_reorder(violation_description, n)) %>% 
  arrange(desc(n)) %>% 
  head(10) %>% 
  ggplot(aes(x = violation_description, y = n, fill = violation_description)) +
  geom_bar(stat = "identity") + 
  theme(
    axis.text.x = element_text(angle = 90, vjust = .5, hjust = 1),
    legend.position = "right",
    legend.text = element_text(size = 8)
  ) +
  labs(
    title = 'Top 10 violations in Queens') +
    xlab("violation description") +
    ylab('Count of tickets') 
## `summarise()` ungrouping output (override with `.groups` argument)
Bronx_vio = 
  vio_count_boro %>% 
  filter(borough == 'Bronx') %>% 
  group_by(violation_description) %>% 
  summarize(n = n()) %>% 
  mutate(violation_description = fct_reorder(violation_description, n)) %>% 
  arrange(desc(n)) %>% 
  head(10) %>% 
  ggplot(aes(x = violation_description, y = n, fill = violation_description)) +
  geom_bar(stat = "identity") + 
  theme(
    axis.text.x = element_text(angle = 90, vjust = .5, hjust = 1),
    legend.position = "right",
    legend.text = element_text(size = 8)
  ) +
  labs(
    title = 'Top 10 violations in Bronx') +
    xlab("violation description") +
    ylab('Count of tickets')  
## `summarise()` ungrouping output (override with `.groups` argument)
Brooklyn_vio = 
  vio_count_boro %>% 
  filter(borough == 'Brooklyn') %>% 
  group_by(violation_description) %>% 
  summarize(n = n()) %>% 
  mutate(violation_description = fct_reorder(violation_description, n)) %>% 
  arrange(desc(n)) %>% 
  head(10) %>% 
  ggplot(aes(x = violation_description, y = n, fill = violation_description)) +
  geom_bar(stat = "identity") + 
  theme(
    axis.text.x = element_text(angle = 90, vjust = .5, hjust = 1),
    legend.position = "right",
    legend.text = element_text(size = 8)
  ) +
  labs(
    title = 'Top 10 violations in Brooklyn') +
    xlab("violation description") +
    ylab('Count of tickets') 
## `summarise()` ungrouping output (override with `.groups` argument)
Staten_vio = 
  vio_count_boro %>% 
  filter(borough == 'Staten Island') %>% 
  group_by(violation_description) %>% 
  summarize(n = n()) %>% 
  mutate(violation_description = fct_reorder(violation_description, n)) %>% 
  arrange(desc(n)) %>% 
  head(10) %>% 
  ggplot(aes(x = violation_description, y = n, fill = violation_description)) +
  geom_bar(stat = "identity") + 
  theme(
    axis.text.x = element_text(angle = 90, vjust = .5, hjust = 1),
    legend.position = "right",
    legend.text = element_text(size = 8)
  ) +
  labs(
    title = 'Top 10 violations in Staten Island') +
    xlab("violation description") +
    ylab('Count of tickets')  
## `summarise()` ungrouping output (override with `.groups` argument)
Manhattan_vio

Queens_vio 

Bronx_vio

Brooklyn_vio

Staten_vio

Anything to do with veihicle_make?

parking %>% 
  select(id, vehicle_make) %>% 
  group_by(vehicle_make) %>% 
  summarise(n = n()) %>% 
  filter(n>10) %>% 
  arrange(desc(n)) %>% 
  head(10) %>% 
  mutate(vehicle_make = fct_reorder(vehicle_make, n)) %>% 
  plot_ly(x = ~vehicle_make, y = ~n, type = "bar", colors = "viridis")
## `summarise()` ungrouping output (override with `.groups` argument)

According to the distribution of vehicle make of the parking ticket dataset, we can roughly know the tops 10 vehicle make that are most popular among new yorkers.

Fine Amount

 As we were motivated by the trouble that New Yorker can get an expensive coffee taxed by the Parking ticket, we need to look at the fine amounts. The Manhattan has the highest mean and the widest variance. Queens and Brooklyn at the first look seems like to have similar ditributions. Further analysis is needed to see if parking violation in Queens may receive a similar fine amount in Brooklyn and etc.

boro_amount = parking %>% 
  drop_na(fine_amount, borough)%>%
  select(borough, fine_amount)%>%
  group_by(borough) 

parking %>% 
  drop_na(fine_amount, borough)%>%
  select(borough, fine_amount)%>%
  group_by(borough) %>% 
  summarize(mean = mean(fine_amount),
            standard_error = sd(fine_amount)) %>% 
  knitr::kable(caption = "Fine Amount in borough")
## `summarise()` ungrouping output (override with `.groups` argument)
Fine Amount in borough
borough mean standard_error
Bronx 65.2 29.8
Brooklyn 62.8 27.8
Manhattan 68.1 30.6
Queens 58.8 26.4
Staten Island 66.0 28.5
boro_amount %>%
  ggplot(aes(x = borough, y = fine_amount, fill = borough)) +
  geom_boxplot() +
  ylim(0,200)+
  theme(
    legend.position = "right",
    legend.text = element_text(size = 8)
  ) 
## Warning: Removed 2188 rows containing non-finite values (stat_boxplot).

 Total amount of fine are similar across weekdays in all five boroughs, and see a steep drop on weekend on all borough. Across all boroughs, Manhattan has the highest total fine, Staten Island has the lowest total fine. We see that the proportion of total fine in a day doesn’t change across a week, we need further analysis to see if Monday in Brooklyn have higher total fine than any other day and etc.

day_order = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")

parking %>%
  dplyr::select(issue_date, fine_amount, borough) %>% 
  drop_na(issue_date,fine_amount) %>% 
  mutate(day_week = weekdays(issue_date),
         day_week = forcats::fct_relevel(day_week,day_order),
         borough = fct_reorder(.f = borough,
                               .x = fine_amount,
                                .fun = sum)) %>% 
  group_by(day_week,borough) %>% 
  summarise(fine_amount = sum(fine_amount,na.rm = T)) %>% 
  plot_ly() %>% 
  add_bars(
    x = ~day_week,
    y = ~fine_amount,
    color = ~borough,
    colors = "viridis"
  ) %>% 
  layout(
    yaxis = list(type = "log"),
    title = "Fine amount"
  )
## `summarise()` regrouping output by 'day_week' (override with `.groups` argument)
  1. fine amount vs weekday faceted by borough
parking %>%
  dplyr::select(issue_date, fine_amount, borough) %>% 
  drop_na(issue_date, borough) %>% 
  mutate(day_week = weekdays(issue_date),
         day_week = factor(day_week, levels = day_order)) %>% 
  arrange(day_week) %>% 
  ggplot(aes(x = day_week, y = fine_amount, fill = day_week)) +
  geom_boxplot() +
  facet_grid(~borough) +
  ylim(0,200) +
  theme(
    legend.position = "right",
    legend.text = element_text(size = 8),
    axis.text.x = element_text(angle = 90, vjust = .5, hjust = 1)
  ) 
## Warning: Removed 2738 rows containing non-finite values (stat_boxplot).

  1. average fine amount distribution
nearby_data %>% 
  drop_na(borough) %>% 
  group_by(borough, business_name) %>% 
  summarize(mean_fine = mean(fine_amount)) %>% 
  ggplot(aes(x = mean_fine, fill = borough)) +
  geom_density(alpha = .5)
## `summarise()` regrouping output by 'borough' (override with `.groups` argument)
## Warning: Removed 58 rows containing non-finite values (stat_density).